#Importing relevant libraries
import polars as pl
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skewnorm, norm
import warnings
warnings.simplefilter(action='ignore')
from darts import TimeSeries
from darts.models import *
from darts.dataprocessing.transformers import Scaler
from darts.metrics import *
from darts.utils.statistics import check_seasonality
# Load the data
# NOTE(review): read_csv infers dtypes; Date stays a raw string here (mixed
# formats are normalized later by normalize_date).
df = pl.read_csv("SnP500.csv")
# Data Summary
# describe() reports count/null_count/mean/std/min/quartiles/max per column.
df.describe()
| statistic | Date | Close | Volume | Inflation | Unemployment | GDP_Growth_Rate | GDP | Interest_rate | CPI |
|---|---|---|---|---|---|---|---|---|---|
| str | str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| "count" | "7555" | 7555.0 | 7555.0 | 7555.0 | 7555.0 | 7555.0 | 7555.0 | 7555.0 | 7555.0 |
| "null_count" | "0" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| "mean" | null | 1616.111603 | 2.6369e9 | 2.15176 | 5.752548 | 2.463843 | 1.4420e13 | 3.317614 | 2.489871 |
| "std" | null | 977.093422 | 1.7951e9 | 1.168099 | 1.627849 | 1.824953 | 5.0062e12 | 2.132803 | 1.441926 |
| "min" | "1/10/1993" | 429.049988 | 1.499e7 | 0.640955 | 3.65 | -2.767803 | 6.8600e12 | -1.189357 | -0.355546 |
| "25%" | null | 1029.030029 | 9.81e8 | 1.558531 | 4.62 | 1.841875 | 1.0300e13 | 2.023885 | 1.622223 |
| "50%" | null | 1301.349976 | 2.8136e9 | 1.89961 | 5.45 | 2.70637 | 1.4500e13 | 2.960506 | 2.33769 |
| "75%" | null | 2050.629883 | 3.8921e9 | 2.37034 | 6.17 | 3.772565 | 1.8200e13 | 4.89831 | 2.951657 |
| "max" | "9/9/2022" | 4796.560059 | 1.1456e10 | 7.005276 | 9.63 | 5.945485 | 2.5500e13 | 7.148178 | 8.0028 |
# Data Peek
# First five rows -- Date is still a raw string; numeric columns are already typed.
df.head()
| Date | Close | Volume | Inflation | Unemployment | GDP_Growth_Rate | GDP | Interest_rate | CPI |
|---|---|---|---|---|---|---|---|---|
| str | f64 | i64 | f64 | f64 | f64 | f64 | f64 | f64 |
| "4/1/1993" | 435.380005 | 201210000 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| "5/1/1993" | 434.339996 | 240350000 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| "6/1/1993" | 434.519989 | 295240000 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| "7/1/1993" | 430.730011 | 304850000 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| "8/1/1993" | 429.049988 | 263470000 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
#Checking on the date patterns
# value_counts is a method: without the call parentheses the previous run only
# printed the bound-method repr rather than the per-date counts (its repr did
# happen to reveal the two date formats, but only by accident).
df["Date"].value_counts()
<bound method Series.value_counts of shape: (7_555,) Series: 'Date' [str] [ "4/1/1993" "5/1/1993" "6/1/1993" "7/1/1993" "8/1/1993" … "23-12-2022" "27-12-2022" "28-12-2022" "29-12-2022" "30-12-2022" ]>
# Detect all unique delimiters in date strings as we see some different patterns w.r.t to delimiters
delimiters = set()
for raw_date in df["Date"]:
    for character in raw_date:
        if not character.isdigit():
            delimiters.add(character)
delimiters
{'-', '/'}
# Function to normalize date formats -- Strictly w.r.t this data
def normalize_date(date_str):
    """Return *date_str* rewritten as zero-padded "%d-%m-%Y".

    The raw file mixes two day-first layouts ("4/1/1993" and "23-12-2022");
    each known format is attempted in turn and the first successful parse
    wins. Raises ValueError if neither format matches.
    """
    known_formats = ("%d-%m-%Y", "%d/%m/%Y")
    for fmt in known_formats:
        try:
            parsed = datetime.strptime(date_str, fmt)
        except ValueError:
            continue
        return parsed.strftime("%d-%m-%Y")
    raise ValueError(f"Date format for {date_str} not recognized")
# Common method to preprocess the DataFrame as per the current data used
def process_dataframe(df):
    """Collapse the daily frame to quarterly means.

    Steps: normalize the mixed-format Date strings, parse them to dates,
    label each row with its year-quarter ("1993Q1", ...), average every
    value column per quarter, and return a pandas DataFrame of
    "meanQ_*" columns indexed (sorted) by the quarter start date.
    """
    # Apply the normalization function to the Date column
    df1 = df.with_columns(pl.col("Date").map_elements(normalize_date))
    # Convert the 'Date' column to datetime
    df1 = df1.with_columns(pl.col("Date").str.strptime(pl.Date))
    # Extract year and quarter from the date and create 'ds' column
    df1 = df1.with_columns((pl.col("Date").dt.year().cast(str) + 'Q' + pl.col("Date").dt.quarter().cast(str)).alias("ds"))
    # Mean per quarter for every column except the grouping key and Date
    # itself -- previously Date was averaged and then dropped, which was
    # wasted work (and relies on polars supporting mean over a Date column).
    agg_exprs = [pl.col(col).mean().alias(f"meanQ_{col}") for col in df1.columns if col not in ("ds", "Date")]
    quarterly_df = df1.groupby("ds").agg(agg_exprs)
    # Convert to Pandas DataFrame
    quarterly_df_pandas = quarterly_df.to_pandas()
    # Convert 'ds' (e.g. "1993Q1") to datetime and set as sorted index
    quarterly_df_pandas['ds'] = pd.to_datetime(quarterly_df_pandas['ds'])
    quarterly_df_pandas = quarterly_df_pandas.set_index('ds').sort_index()
    return quarterly_df_pandas
# Process the DataFrame
# Collapse the ~7.5k daily rows into quarterly means indexed by quarter start.
data = process_dataframe(df)
data
| meanQ_Close | meanQ_Volume | meanQ_Inflation | meanQ_Unemployment | meanQ_GDP_Growth_Rate | meanQ_GDP | meanQ_Interest_rate | meanQ_CPI | |
|---|---|---|---|---|---|---|---|---|
| ds | ||||||||
| 1993-01-01 | 442.750321 | 2.659718e+08 | 2.370340 | 6.90 | 2.751781 | 6.860000e+12 | 3.545617 | 2.951657 |
| 1993-04-01 | 445.505872 | 2.620033e+08 | 2.370340 | 6.90 | 2.751781 | 6.860000e+12 | 3.545617 | 2.951657 |
| 1993-07-01 | 453.558748 | 2.557414e+08 | 2.370340 | 6.90 | 2.751781 | 6.860000e+12 | 3.545617 | 2.951657 |
| 1993-10-01 | 464.271874 | 2.751294e+08 | 2.370340 | 6.90 | 2.751781 | 6.860000e+12 | 3.545617 | 2.951657 |
| 1994-01-01 | 469.213492 | 3.127857e+08 | 2.135424 | 6.12 | 4.028793 | 7.290000e+12 | 4.898310 | 2.607442 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2021-10-01 | 4602.108894 | 4.082385e+09 | 4.492792 | 5.35 | 5.945485 | 2.330000e+13 | -1.189357 | 4.697859 |
| 2022-01-01 | 4463.855477 | 5.028659e+09 | 7.005276 | 3.65 | 2.061593 | 2.550000e+13 | 0.000000 | 8.002800 |
| 2022-04-01 | 4105.667102 | 4.924918e+09 | 7.005276 | 3.65 | 2.061593 | 2.550000e+13 | 0.000000 | 8.002800 |
| 2022-07-01 | 3980.351112 | 4.190339e+09 | 7.005276 | 3.65 | 2.061593 | 2.550000e+13 | 0.000000 | 8.002800 |
| 2022-10-01 | 3851.973501 | 4.345159e+09 | 7.005276 | 3.65 | 2.061593 | 2.550000e+13 | 0.000000 | 8.002800 |
120 rows × 8 columns
# Confirm the "meanQ_*" column names produced by process_dataframe.
data.columns
Index(['meanQ_Close', 'meanQ_Volume', 'meanQ_Inflation', 'meanQ_Unemployment',
'meanQ_GDP_Growth_Rate', 'meanQ_GDP', 'meanQ_Interest_rate',
'meanQ_CPI'],
dtype='object')
# Plot using Plotly -- kindly select the variables individually in the right legend to have a better visualization.
# One line per quarterly column against the date index.
fig = px.line(data, x= data.index,
y=data.columns, title='Time Series Data')
fig.show()
#Distplot and Boxplot for each feature - Data distribution and Outlier Detection
plt.figure(figsize=[20, 60])
columns = data.columns
cnt = 1
for col in columns:
    plt.subplot(14, 2, cnt)
    # sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
    # histplot(kde=True) plus an explicit fitted-normal overlay replaces
    # distplot(..., fit=norm).
    sns.histplot(data[col], stat="density", kde=True)
    mu, sigma = norm.fit(data[col])
    grid = np.linspace(data[col].min(), data[col].max(), 200)
    plt.plot(grid, norm.pdf(grid, mu, sigma), 'k--', label='normal fit')
    cnt += 1
    plt.subplot(14, 2, cnt)
    # Positional data arg is deprecated for seaborn axes-level functions.
    sns.boxplot(x=data[col])
    cnt += 1
plt.tight_layout()
plt.show()
1) The features do not appear to follow a normal distribution closely, except for CPI. 2) Close, Inflation, Unemployment, GDP Growth Rate and CPI have outliers.
# Outlier Treatment - Values below the lower bound are replaced with the lower bound,
# and values above the upper bound are replaced with the upper bound.
# Function to detect and treat outliers using IQR method
def treat_outliers(df, column):
    """Winsorize one column of *df* in place using the 1.5*IQR rule.

    Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clamped to the nearer
    bound. The frame is modified in place and also returned for chaining.
    """
    Q1 = df[column].quantile(0.25)  # Q1 is the 25th percentile, and
    Q3 = df[column].quantile(0.75)  # Q3 is the 75th percentile of the data.
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Series.clip performs both bound replacements in one vectorized pass,
    # replacing the previous pair of np.where assignments.
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return df
# List of columns to treat for outliers
columns_to_treat = ['meanQ_Close', 'meanQ_Inflation', 'meanQ_Unemployment', 'meanQ_GDP_Growth_Rate', 'meanQ_CPI']
# Apply outlier treatment to each column.
# Work on a copy: treat_outliers assigns into the frame it is given, so the
# previous version silently modified `data` itself, invalidating the later
# "raw data" comparison run that reuses `data`.
data_treated = data.copy()
for column in columns_to_treat:
    data_treated = treat_outliers(data_treated, column)
# Pairwise correlations of the treated quarterly features
correlation_matrix = data_treated.corr()
# Render the matrix as a Plotly heatmap
heatmap_trace = go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='Viridis',
)
fig_corr = go.Figure(data=heatmap_trace)
fig_corr.update_layout(title='Correlation Heatmap')
fig_corr.show()
# Drop the target variable
independent_variables = data_treated.drop(columns=['meanQ_Unemployment'])
# Calculate VIF for each independent variable.
# A high VIF value (typically greater than 10) indicates that the variance
# of the coefficient estimate for that variable is inflated due to multicollinearity.
# statsmodels' variance_inflation_factor assumes the design matrix includes an
# intercept column; add_constant (imported above but previously unused)
# supplies it -- without it the VIFs of uncentered variables are inflated.
design_matrix = add_constant(independent_variables)
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(design_matrix.values, i) for i in range(design_matrix.shape[1])]
vif["features"] = design_matrix.columns
# The intercept's own VIF carries no information -- drop it from the report.
vif = vif[vif["features"] != "const"].reset_index(drop=True)
vif
| VIF Factor | features | |
|---|---|---|
| 0 | 8.460349 | meanQ_Close |
| 1 | 8.136021 | meanQ_Volume |
| 2 | 5.882536 | meanQ_Inflation |
| 3 | 1.787589 | meanQ_GDP_Growth_Rate |
| 4 | 158.537938 | meanQ_GDP |
| 5 | 1.025189 | meanQ_Interest_rate |
| 6 | 5.205215 | meanQ_CPI |
We observe that GDP has an excessively high VIF value, so we will eliminate it, as it might introduce noise into the model. We also observe that Close and Volume have similarly high VIF values; had they exceeded the threshold of 10, we would have retained only one of them in the model — the same applies to CPI and Inflation.
Note that this model building is for multivariate time series analysis, where the target variable is Unemployment and the features are the remaining macroeconomic parameters.
# Function to create TimeSeries objects, scale data, split into train/test, fit models, and calculate metrics
def evaluate_models(data, target_col, feature_cols, models):
    """Fit each darts model on the scaled target with past covariates and
    report sMAPE/RMSE on the held-out 20% tail.

    Parameters
    ----------
    data : pandas.DataFrame indexed by date.
    target_col : name of the forecast target column.
    feature_cols : names of the columns used as past covariates.
    models : mapping of model name -> unfitted darts forecasting model.
        NOTE(review): fit() mutates these objects, so fresh models should be
        supplied for each call -- verify against darts retraining semantics.

    Returns
    -------
    (metrics_df, predictions, target_scaled_series)
    """
    # Create TimeSeries objects for target and covariates
    target_series = TimeSeries.from_dataframe(data, value_cols=target_col)
    feature_series = TimeSeries.from_dataframe(data, value_cols=feature_cols)
    # Ensure past covariates cover exactly the target's time span
    feature_series = feature_series.slice_intersect(target_series)
    # One scaler per series: the original reused a single Scaler, whose second
    # fit_transform refit it on the features and discarded the target's fit --
    # making it impossible to inverse-transform the target predictions later.
    target_scaler = Scaler()
    feature_scaler = Scaler()
    target_scaled_series = target_scaler.fit_transform(target_series)
    feature_scaled_series = feature_scaler.fit_transform(feature_series)
    # 80/20 chronological train/test split
    train_target, test_target = target_scaled_series.split_before(0.8)
    train_features, _ = feature_scaled_series.split_before(0.8)
    metrics = {
        'Model': [],
        'sMAPE': [],
        'RMSE': []
    }
    predictions = {}
    for model_name, model in models.items():
        # Fit on the training target with the training-period covariates
        model.fit(train_target, past_covariates=train_features)
        # Forecast the full test horizon
        pred = model.predict(len(test_target))
        # Store predictions
        predictions[model_name] = pred
        # Validation metrics computed in scaled space
        metrics['Model'].append(model_name)
        metrics['sMAPE'].append(smape(test_target, pred))
        metrics['RMSE'].append(rmse(test_target, pred))
    # Create a DataFrame for the metrics
    metrics_df = pd.DataFrame(metrics)
    return metrics_df, predictions, target_scaled_series
# Define models
# Three darts deep-learning forecasters; each reads 48 quarters of history
# and emits up to 36 quarters per forward pass.
# NOTE(review): fit() mutates these model objects, so this cell must be re-run
# before evaluate_models is invoked again -- presumably why the rerun advice
# appears before the second run below; verify against darts' semantics.
models = {
'NHiTS': NHiTSModel(input_chunk_length=48, output_chunk_length=36),
'TiDE': TiDEModel(input_chunk_length=48, output_chunk_length=36),
'NBEATS': NBEATSModel(input_chunk_length=48, output_chunk_length=36)
}
# Evaluate models
# Covariates: all macro features except GDP (excluded for its extreme VIF).
metrics_df1, predictions1, target_scaled_series1 = evaluate_models(data_treated, 'meanQ_Unemployment',
['meanQ_Close', 'meanQ_Volume', 'meanQ_Inflation', 'meanQ_GDP_Growth_Rate', 'meanQ_Interest_rate', 'meanQ_CPI'],
models)
metrics_df1
GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs | Name | Type | Params | Mode ------------------------------------------------------------- 0 | criterion | MSELoss | 0 | train 1 | train_criterion | MSELoss | 0 | train 2 | val_criterion | MSELoss | 0 | train 3 | train_metrics | MetricCollection | 0 | train 4 | val_metrics | MetricCollection | 0 | train 5 | stacks | ModuleList | 1.4 M | train ------------------------------------------------------------- 1.2 M Trainable params 172 K Non-trainable params 1.4 M Total params 5.621 Total estimated model params size (MB)
Training: | | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached. GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs
Predicting: | | 0/? [00:00<…
GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs | Name | Type | Params | Mode ----------------------------------------------------------------- 0 | criterion | MSELoss | 0 | train 1 | train_criterion | MSELoss | 0 | train 2 | val_criterion | MSELoss | 0 | train 3 | train_metrics | MetricCollection | 0 | train 4 | val_metrics | MetricCollection | 0 | train 5 | past_cov_projection | _ResidualBlock | 1.4 K | train 6 | encoders | Sequential | 78.2 K | train 7 | decoders | Sequential | 165 K | train 8 | temporal_decoder | _ResidualBlock | 594 | train 9 | lookback_skip | Linear | 1.8 K | train ----------------------------------------------------------------- 247 K Trainable params 0 Non-trainable params 247 K Total params 0.989 Total estimated model params size (MB)
Training: | | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached. GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs
Predicting: | | 0/? [00:00<…
GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs | Name | Type | Params | Mode ------------------------------------------------------------- 0 | criterion | MSELoss | 0 | train 1 | train_criterion | MSELoss | 0 | train 2 | val_criterion | MSELoss | 0 | train 3 | train_metrics | MetricCollection | 0 | train 4 | val_metrics | MetricCollection | 0 | train 5 | stacks | ModuleList | 8.7 M | train ------------------------------------------------------------- 8.7 M Trainable params 3.3 K Non-trainable params 8.7 M Total params 34.770 Total estimated model params size (MB)
Training: | | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached. GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs
Predicting: | | 0/? [00:00<…
| Model | sMAPE | RMSE | |
|---|---|---|---|
| 0 | NHiTS | 113.325883 | 0.502218 |
| 1 | TiDE | 128.925258 | 0.821985 |
| 2 | NBEATS | 118.022760 | 0.559883 |
# Convert the scaled TimeSeries objects to pandas DataFrames for plotting
actual_scaled_df1 = target_scaled_series1.pd_dataframe()
# NHiTS is plotted: it posted the lowest RMSE of the three models above.
predicted_df1 = predictions1['NHiTS'].pd_dataframe()
# Build the figure: actual traces first, then predicted ones
fig = go.Figure()
for frame, label in ((actual_scaled_df1, 'Actual'), (predicted_df1, 'Predicted')):
    for col in frame.columns:
        fig.add_trace(go.Scatter(x=frame.index, y=frame[col], mode='lines', name=f'{label} {col}'))
# Shared layout for the comparison chart
fig.update_layout(
    title="Actual vs Predicted Time Series Analysis - Treated",
    xaxis_title="Date",
    yaxis_title="Value",
    legend_title="Legend",
    width=1000,
    height=600
)
fig.show()
# Rerun "evaluate_models" method again before running this cell otherwise it will error out.
# NOTE(review): the models in `models` were already fitted above; darts torch
# models should be re-instantiated before refitting -- likely the reason for
# the rerun advice. Verify.
# NOTE(review): treat_outliers assigns into the frame it receives, so `data`
# here may already contain outlier-treated values rather than raw ones --
# confirm before trusting this treated-vs-raw comparison.
# Evaluate models (GDP included this time)
metrics_df2, predictions2, target_scaled_series2 = evaluate_models(data, 'meanQ_Unemployment',
['meanQ_Close', 'meanQ_Volume', 'meanQ_Inflation', 'meanQ_GDP_Growth_Rate','meanQ_GDP', 'meanQ_Interest_rate', 'meanQ_CPI'],
models)
metrics_df2
GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs | Name | Type | Params | Mode ------------------------------------------------------------- 0 | criterion | MSELoss | 0 | train 1 | train_criterion | MSELoss | 0 | train 2 | val_criterion | MSELoss | 0 | train 3 | train_metrics | MetricCollection | 0 | train 4 | val_metrics | MetricCollection | 0 | train 5 | stacks | ModuleList | 1.5 M | train ------------------------------------------------------------- 1.3 M Trainable params 196 K Non-trainable params 1.5 M Total params 5.973 Total estimated model params size (MB)
Training: | | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached. GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs
Predicting: | | 0/? [00:00<…
GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs | Name | Type | Params | Mode ----------------------------------------------------------------- 0 | criterion | MSELoss | 0 | train 1 | train_criterion | MSELoss | 0 | train 2 | val_criterion | MSELoss | 0 | train 3 | train_metrics | MetricCollection | 0 | train 4 | val_metrics | MetricCollection | 0 | train 5 | past_cov_projection | _ResidualBlock | 1.6 K | train 6 | encoders | Sequential | 78.2 K | train 7 | decoders | Sequential | 165 K | train 8 | temporal_decoder | _ResidualBlock | 594 | train 9 | lookback_skip | Linear | 1.8 K | train ----------------------------------------------------------------- 247 K Trainable params 0 Non-trainable params 247 K Total params 0.989 Total estimated model params size (MB)
Training: | | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached. GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs
Predicting: | | 0/? [00:00<…
GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs | Name | Type | Params | Mode ------------------------------------------------------------- 0 | criterion | MSELoss | 0 | train 1 | train_criterion | MSELoss | 0 | train 2 | val_criterion | MSELoss | 0 | train 3 | train_metrics | MetricCollection | 0 | train 4 | val_metrics | MetricCollection | 0 | train 5 | stacks | ModuleList | 9.1 M | train ------------------------------------------------------------- 9.1 M Trainable params 3.6 K Non-trainable params 9.1 M Total params 36.305 Total estimated model params size (MB)
Training: | | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached. GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs
Predicting: | | 0/? [00:00<…
| Model | sMAPE | RMSE | |
|---|---|---|---|
| 0 | NHiTS | 115.131707 | 0.518734 |
| 1 | TiDE | 127.613580 | 0.600994 |
| 2 | NBEATS | 118.307593 | 0.588103 |
# Convert the scaled TimeSeries objects to pandas DataFrames for plotting
actual_scaled_df2 = target_scaled_series2.pd_dataframe()
# NHiTS is plotted: it posted the lowest RMSE of the three models above.
predicted_df2 = predictions2['NHiTS'].pd_dataframe()
# Build the figure: actual traces first, then predicted ones
fig = go.Figure()
for frame, label in ((actual_scaled_df2, 'Actual'), (predicted_df2, 'Predicted')):
    for col in frame.columns:
        fig.add_trace(go.Scatter(x=frame.index, y=frame[col], mode='lines', name=f'{label} {col}'))
# Shared layout for the comparison chart
fig.update_layout(
    title="Actual vs Predicted Time Series Analysis-Raw",
    xaxis_title="Date",
    yaxis_title="Value",
    legend_title="Legend",
    width=1000,
    height=600
)
fig.show()
We get similar results here, but after treatment the errors are reduced, so outlier treatment and addressing multicollinearity can play a significant role.
# Select relevant columns for the analysis
series = data['meanQ_Unemployment']  # NOTE(review): not used below -- kept for compatibility
# Create a TimeSeries object for the univariate target
timeseries = TimeSeries.from_dataframe(data, value_cols='meanQ_Unemployment')
# Scale the data
scaler = Scaler()
scaled_series = scaler.fit_transform(timeseries)
# 80/20 chronological train/test split
train, test = scaled_series.split_before(0.8)
# Define fresh (unfitted) models with the same chunk lengths as before
models = {
    "NHiTS": NHiTSModel(input_chunk_length=48, output_chunk_length=36),
    "NBEATS": NBEATSModel(input_chunk_length=48, output_chunk_length=36),
    "TiDE": TiDEModel(input_chunk_length=48, output_chunk_length=36)
}
# Collect one result row per model. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0 -- build the frame once from the rows instead.
result_rows = []
predictions = {}
# Loop over models
for model_name, model in models.items():
    # Fit the model on the scaled training series (no covariates)
    model.fit(train)
    # Forecast the full test horizon
    pred = model.predict(len(test))
    # Store predictions
    predictions[model_name] = pred
    # Validation metrics in scaled space
    result_rows.append({
        'Models': model_name,
        'sMAPE': smape(test, pred),
        'RMSE': rmse(test, pred)
    })
results_df = pd.DataFrame(result_rows, columns=['Models', 'sMAPE', 'RMSE'])
results_df
GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs | Name | Type | Params | Mode ------------------------------------------------------------- 0 | criterion | MSELoss | 0 | train 1 | train_criterion | MSELoss | 0 | train 2 | val_criterion | MSELoss | 0 | train 3 | train_metrics | MetricCollection | 0 | train 4 | val_metrics | MetricCollection | 0 | train 5 | stacks | ModuleList | 877 K | train ------------------------------------------------------------- 852 K Trainable params 24.6 K Non-trainable params 877 K Total params 3.509 Total estimated model params size (MB)
Training: | | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached. GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs
Predicting: | | 0/? [00:00<…
GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs | Name | Type | Params | Mode ------------------------------------------------------------- 0 | criterion | MSELoss | 0 | train 1 | train_criterion | MSELoss | 0 | train 2 | val_criterion | MSELoss | 0 | train 3 | train_metrics | MetricCollection | 0 | train 4 | val_metrics | MetricCollection | 0 | train 5 | stacks | ModuleList | 6.4 M | train ------------------------------------------------------------- 6.4 M Trainable params 1.6 K Non-trainable params 6.4 M Total params 25.559 Total estimated model params size (MB)
Training: | | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached. GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs
Predicting: | | 0/? [00:00<…
GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs | Name | Type | Params | Mode -------------------------------------------------------------- 0 | criterion | MSELoss | 0 | train 1 | train_criterion | MSELoss | 0 | train 2 | val_criterion | MSELoss | 0 | train 3 | train_metrics | MetricCollection | 0 | train 4 | val_metrics | MetricCollection | 0 | train 5 | encoders | Sequential | 29.1 K | train 6 | decoders | Sequential | 165 K | train 7 | temporal_decoder | _ResidualBlock | 594 | train 8 | lookback_skip | Linear | 1.8 K | train -------------------------------------------------------------- 196 K Trainable params 0 Non-trainable params 196 K Total params 0.786 Total estimated model params size (MB)
Training: | | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached. GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs
Predicting: | | 0/? [00:00<…
| Models | sMAPE | RMSE | |
|---|---|---|---|
| 0 | NHiTS | 135.284882 | 0.866495 |
| 1 | NBEATS | 129.268787 | 0.790406 |
| 2 | TiDE | 139.363406 | 1.106389 |
# Convert TimeSeries to pandas DataFrame for plotting
actual_scaled_df3 = scaled_series.pd_dataframe()
# Per the univariate metrics table above, NBEATS -- not NHiTS -- has the
# minimum RMSE (0.790 vs 0.866) and the minimum sMAPE, so its predictions
# are the ones to plot (the previous cell selected NHiTS by mistake).
predicted_df3 = predictions['NBEATS'].pd_dataframe()
# Create traces
fig = go.Figure()
# Add actual data trace
for col in actual_scaled_df3.columns:
    fig.add_trace(go.Scatter(x=actual_scaled_df3.index, y=actual_scaled_df3[col], mode='lines', name=f'Actual {col}'))
# Add predicted data trace
for col in predicted_df3.columns:
    fig.add_trace(go.Scatter(x=predicted_df3.index, y=predicted_df3[col], mode='lines', name=f'Predicted {col}'))
# Update layout
fig.update_layout(
    title="Actual vs Predicted Univariate Time Series Analysis",
    xaxis_title="Date",
    yaxis_title="Value",
    legend_title="Legend",
    width=1000,
    height=600
)
fig.show()
With the same parameters, we also tried a univariate analysis; its errors are higher than those of the multivariate one. The models work better when past_covariates are included.
1) For any given time series data, the 'date' column should be checked thoroughly in case there are different patterns in either the date format or the date delimiter. 2) If there are multiple outliers across different columns, they should be treated. 3) If the heatmap shows multiple correlations among independent variables, we should check multicollinearity via VIF; if VIF > 10 and two variables show similarly high VIF values, only one of them should be retained. 4) The model validation metrics show that errors are reduced when the data has been addressed with outlier treatment and multicollinearity handling. 5) Very important — experiment with the input_chunk_length and output_chunk_length parameters of the models:
a) The forecast horizon n should be kept smaller than output_chunk_length.
b) When the values of input_chunk_length and output_chunk_length are increased, the models predict better than with smaller values in multivariate TSA, and vice versa in univariate TSA.